Data

acc1_cancer_cells = readRDS("./Data/acc1_cancer_cells_15KnCount_V3.RDS")
all_acc_cancer_cells = readRDS("./Data/acc_cancer_cells_V3.RDS")
acc_all_cells = readRDS("./Data/acc_tpm_nCount_mito_no146_15k_with_ACC1_.RDS")
luminal_pathways = c("CHARAFE_BREAST_CANCER_LUMINAL_VS_BASAL_DN","CHARAFE_BREAST_CANCER_LUMINAL_VS_BASAL_UP","CHARAFE_BREAST_CANCER_LUMINAL_VS_MESENCHYMAL_DN","CHARAFE_BREAST_CANCER_LUMINAL_VS_MESENCHYMAL_UP","HUPER_BREAST_BASAL_VS_LUMINAL_DN","LIM_MAMMARY_LUMINAL_PROGENITOR_UP","SMID_BREAST_CANCER_LUMINAL_B_UP" )

# add luminal pathways
luminal_gs = msigdbr(species = "Homo sapiens") %>%as.data.frame() %>% dplyr::filter(gs_name %in% luminal_pathways)%>% dplyr::distinct(gs_name, gene_symbol) %>% as.data.frame()

Parameters

suffix = "01_25"
data_to_read = ""

functions

source_from_github(repositoy = "DEG_functions",version = "0.2.24")
source_from_github(repositoy = "HMSC_functions",version = "0.1.11",script_name = "functions.R")
source_from_github(repositoy = "cNMF_functions",version = "0.3.53",script_name = "cnmf_function_Harmony.R")

Enrichment analysis HMSC vs ACC

patient.ident = all_acc_cancer_cells$patient.ident %>% as.data.frame()
patient.ident[,1] = as.character(patient.ident[,1])
patient.ident[patient.ident[,1] == "ACC1",] = "HMSC"
patient.ident[,1] = as.factor(patient.ident[,1])
all_acc_cancer_cells = AddMetaData(object = all_acc_cancer_cells,metadata = patient.ident,col.name = "patient.ident")
all_acc_cancer_cells = SetIdent(all_acc_cancer_cells, value ="patient.ident")
acc_deg <- FindMarkers(all_acc_cancer_cells, ident.1 = "HMSC",logfc.threshold = 1.5,features = VariableFeatures(all_acc_cancer_cells))
enrichment_analysis(acc_deg,background = VariableFeatures(all_acc_cancer_cells),fdr_Cutoff = 0.01,ident.1 = "HMSC",ident.2 = "ACC",show_by = 1)

cell cycle filtering

hallmark_name = "GO_MITOTIC_CELL_CYCLE"
genesets  =getGmt("./Data/h.all.v7.0.symbols.pluscc.gmt")
var_features=all_acc_cancer_cells@assays$RNA@var.features
geneIds= genesets[[hallmark_name]]@geneIds
score <- apply(all_acc_cancer_cells@assays$RNA@scale.data[intersect(geneIds,var_features),],2,mean)
all_acc_cancer_cells=AddMetaData(all_acc_cancer_cells,score,hallmark_name)

#filter:
all_acc_cancer_cells_ccFiltered=all_acc_cancer_cells[,all_acc_cancer_cells@meta.data[[hallmark_name]]< 0.3]


min_threshold = min(all_acc_cancer_cells$GO_MITOTIC_CELL_CYCLE)
max_threshold = max(all_acc_cancer_cells$GO_MITOTIC_CELL_CYCLE)

Before cc filtering

FeaturePlot(object = all_acc_cancer_cells,features = hallmark_name) + ggtitle("Before cc filtering")  & scale_color_gradientn(colours = plasma(n = 10, direction = -1), limits = c(min_threshold, max_threshold))
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.

After cc filtering

FeaturePlot(object = all_acc_cancer_cells_ccFiltered,features = hallmark_name) + ggtitle("After cc filtering") & scale_color_gradientn(colours = plasma(n = 10, direction = -1), limits = c(min_threshold, max_threshold))
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.

Enrichment analysis filtered HMSC vs ACC

patient.ident = all_acc_cancer_cells_ccFiltered$patient.ident %>% as.data.frame()
patient.ident[,1] = as.character(patient.ident[,1])
patient.ident[patient.ident[,1] == "ACC1",] = "HMSC"
patient.ident[,1] = as.factor(patient.ident[,1])
all_acc_cancer_cells_ccFiltered = AddMetaData(object = all_acc_cancer_cells_ccFiltered,metadata = patient.ident,col.name = "patient.ident")
all_acc_cancer_cells_ccFiltered = SetIdent(all_acc_cancer_cells_ccFiltered, value ="patient.ident")
acc_deg <- FindMarkers(all_acc_cancer_cells_ccFiltered, ident.1 = "HMSC",logfc.threshold = 1.5,features = VariableFeatures(all_acc_cancer_cells_ccFiltered))
enrichment_analysis(acc_deg,background = VariableFeatures(all_acc_cancer_cells_ccFiltered),fdr_Cutoff = 0.01,ident.1 = "HMSC",ident.2 = "ACC",show_by = 1)

MYB expression

all_acc_cancer_cells = SetIdent(object = all_acc_cancer_cells,value = "patient.ident") #active snn graph
FeaturePlot(object = all_acc_cancer_cells,features = "MYB",label = T)

CNV

new.cluster.ids <- c("cancer", #0
                     "cancer", #1
                     "CAF", #2
                     "cancer", #3
                     "Endothelial", #4
                     "cancer", #5
                     "cancer", #6
                     "CAF", #7
                     "CAF", #8
                     "CAF", #9
                     "cancer", #10
                     "CAF", #11
                     "cancer", #12
                     "cancer", #13
                     "cancer", #14
                     "cancer", #15
                     "cancer", #16
                     "WBC", #17
                     "CAF" #18
                     )
#rename idents:
acc_all_cells = SetIdent(object = acc_all_cells,value = "RNA_snn_res.1") #active snn graph
names(new.cluster.ids) <- levels(acc_all_cells) #add snn graph levels to new.cluster.ids
acc_all_cells@meta.data[["seurat_clusters"]] = acc_all_cells@meta.data[["RNA_snn_res.1"]]
acc_all_cells = SetIdent(object = acc_all_cells,value = "seurat_clusters")
acc_all_cells <- RenameIdents(acc_all_cells, new.cluster.ids) 

# divide "cancer" into patients:
cell_types = acc_all_cells@active.ident %>% as.data.frame()
cell_types[,1]<- as.character(cell_types[,1])
cell_types = cbind(cell_types,acc_all_cells$patient.ident) %>% setnames(old = names(.), 
         new = c('cell_type','patient'))
cell_types[cell_types$cell_type == "cancer",] = cell_types[cell_types$cell_type == "cancer",2]


# hmsc_rows = (startsWith(x = rownames(cell_types),prefix = "ACC.plate2") | startsWith(x = rownames(cell_types),prefix = "ACC1.")) & cell_types[,1] == "cancer" 
# acc_rows = !(startsWith(x = rownames(cell_types),prefix = "ACC.plate2") | startsWith(x = rownames(cell_types),prefix = "ACC1.")) & cell_types[,1] == "cancer" 
# cell_types[,1][hmsc_rows]  = "HMSC"
# cell_types[,1][acc_rows]  = "ACC"

#add to metadata:
cell_types[,2] = NULL 
cell_types[cell_types$cell_type == "ACC1",] = "HMSC"
acc_all_cells = AddMetaData(object =acc_all_cells ,metadata = cell_types,col.name = "cell.type")

CNV UMAP

CNV plot

CNV plot ## {-}

CNV subtypes

cnv_subtypes = as.data.frame(cutree(infercnv_obj_default@tumor_subclusters[["hc"]][["HMSC"]], k = 2))
names(cnv_subtypes)[1] = "cnv.cluster"
rownames(cnv_subtypes) = rownames(cnv_subtypes) %>% gsub(pattern = "-",replacement = "\\.")
infercnv.observations = data.frame(fread(file = "./Data/inferCNV/infercnv.observations.txt"), row.names=1)
Warning in fread(file = "./Data/inferCNV/infercnv.observations.txt") :
  Detected 1332 column names but the data has 1333 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.
names_to_keep = colnames(infercnv.observations) %in% (colnames(acc1_cancer_cells) %>% gsub(pattern = "_",replacement = "\\."))
infercnv.observations = infercnv.observations[,names_to_keep]
rotate <- function(x) t(apply(x, 2, rev))
infercnv.observations2 = infercnv.observations %>% rotate() %>%  rotate() %>% rotate()%>% as.data.frame() 
breaks = c(0.700891861704857,
0.742366945528369,
0.783842029351881,
0.825317113175393,
0.866792196998905,
0.908267280822417,
0.949742364645928,
0.99121744846944,
1.03269253229295,
1.07416761611646,
1.11564269993998,
1.15711778376349,
1.198592867587,
1.24006795141051,
1.28154303523402,
1.32301811905753)
pheatmap(infercnv.observations2,cluster_cols = F,cluster_rows = F, show_rownames = F,show_colnames = F, breaks = breaks,color = colorRampPalette(rev(c("darkred", "white", "darkblue")))(15),annotation_row = cnv_subtypes)

rownames(cnv_subtypes) = rownames(cnv_subtypes) %>% gsub(pattern = "2\\.",replacement = "2_")
rownames(cnv_subtypes) = rownames(cnv_subtypes) %>% gsub(pattern = "3\\.",replacement = "3_")
acc1_cancer_cells = AddMetaData(object = acc1_cancer_cells,metadata = cnv_subtypes)
DimPlot(acc1_cancer_cells,group.by = "cnv.cluster",pt.size = 2,cols =colors)

Original score

original_myo_genes = c( "TP63", "TP73", "CAV1", "CDH3", "KRT5", "KRT14", "ACTA2", "TAGLN", "MYLK", "DKK3")
original_lum_genes = c("KIT", "EHF", "ELF5", "KRT7", "CLDN3", "CLDN4", "CD24", "LGALS3", "LCN2", "SLPI" )
calculate_score(dataset = all_acc_cancer_cells,myo_genes = original_myo_genes,lum_genes = original_lum_genes)
correlation of lum score and myo score: -0.45
correlation of lum score and original lum score: 1
correlation of myo score and original myo score: 1

Original score of ACC1

calculate_score(dataset = acc1_cancer_cells,myo_genes = original_myo_genes,lum_genes = original_lum_genes,lum_threshold = 0,myo_threshold = 0)
correlation of lum score and myo score: 0.06
correlation of lum score and original lum score: 1
correlation of myo score and original myo score: 1

0.35 Most correlated score

Myo genes

myo_protein_markers = c("CNN1", "TP63","ACTA2")
top_myo  = top_correlated(dataset = acc1_cancer_cells, genes = myo_protein_markers,threshold = 0.35)
print("Number of genes = " %>% paste(length(top_myo)))
[1] "Number of genes =  20"
message("Names of genes:")
Names of genes:
top_myo %>% head(30)
 [1] "COL16A1"     "RP1-39G22.4" "ACTG2"       "CD200"       "MYLK"        "TP63"        "KCNMB1"      "ADAMTS2"    
 [9] "LOXL2"       "TPM2"        "CLIC3"       "SNCG"        "ACTA2"       "TAGLN"       "A2M"         "NGFR"       
[17] "CNN1"        "PPP1R14A"    "MYL9"        "POM121L9P"  
message("Genes that also apeared in the original score:")
Genes that also apeared in the original score:
base::intersect(top_myo,original_myo_genes) 
[1] "MYLK"  "TP63"  "ACTA2" "TAGLN"
myo_enrich_res = genes_vec_enrichment(genes = top_myo,background = rownames(acc1_cancer_cells),homer = T,title = "myo top enrichment",custom_pathways = luminal_gs)

myo_enrich_res

Lum genes

lum_protein_markers = c("KIT")
top_lum  = top_correlated(dataset = acc1_cancer_cells, genes = lum_protein_markers,threshold = 0.35)
Warning in cor(expression %>% t(), markers_average) :
  the standard deviation is zero
print("Number of genes = " %>% paste(length(top_lum)))
[1] "Number of genes =  22"
message("Names of genes:")
Names of genes:
top_lum %>% head(30)
 [1] "FBXO44"        "USP48"         "MPC2"          "SLC19A2"       "ATL2"          "B3GNT2"        "AC093162.5"   
 [8] "MAP2"          "KIT"           "GLRB"          "EFNA5"         "PCDHGB7"       "SLC29A1"       "SASH1"        
[15] "ALDH3B2"       "CCND1"         "RP11-254B13.1" "VSIG10"        "NDFIP2"        "SUSD6"         "SPPL2A"       
[22] "DSC2"         
message("Genes that also apeared in the original score:")
Genes that also apeared in the original score:
base::intersect(top_lum,original_lum_genes) 
[1] "KIT"
lum_enrich_res = genes_vec_enrichment(genes = top_lum,background = rownames(acc1_cancer_cells),homer = T,title = "lum top enrichment",custom_pathways = luminal_gs)

lum_enrich_res

top correlated score

calculate_score(dataset = acc1_cancer_cells,myo_genes = top_myo,lum_genes = top_lum,lum_threshold = 0,myo_threshold = 0)
correlation of lum score and myo score: -0.05
correlation of lum score and original lum score: 0.58
correlation of myo score and original myo score: 0.77

enriched genes score

rownames(lum_enrich_res) = lum_enrich_res$pathway_name
lum_enriched_genes = lum_enrich_res[1,"geneID"] %>% strsplit(split = "/") %>% .[[1]] %>% c(.,lum_protein_markers) #add original markers
rownames(myo_enrich_res) = myo_enrich_res$pathway_name
myo_enriched_genes = myo_enrich_res[1,"geneID"] %>% strsplit(split = "/") %>% .[[1]] %>% c(.,myo_protein_markers) #add original markers
calculate_score(dataset = acc1_cancer_cells,myo_genes = myo_enriched_genes,lum_genes = lum_enriched_genes,lum_threshold = -2.5,myo_threshold = -2.5)
correlation of lum score and myo score: -0.16
correlation of lum score and original lum score: 0.43
correlation of myo score and original myo score: 0.79

0.2 Most correlated score

myo Genes

myo_protein_markers = c("CNN1", "TP63","ACTA2")
top_myo  = top_correlated(dataset = acc1_cancer_cells, genes = myo_protein_markers,threshold = 0.2)
print("Number of genes = " %>% paste(length(top_myo)))
[1] "Number of genes =  473"
message("Names of genes:")
Names of genes:
top_myo %>% head(30)
 [1] "MIR429"        "EXOSC10"       "PLOD1"         "TNFRSF8"       "PDPN"          "FBLIM1"        "RP11-276H7.3" 
 [8] "HSPG2"         "EPHA8"         "IFNLR1"        "GRHL3"         "RP3-426I6.6"   "SDC3"          "RP11-490K7.1" 
[15] "RP11-73M7.1"   "TINAGL1"       "COL16A1"       "RP1-117O3.2"   "RP11-435D7.3"  "RP1-39G22.4"   "SCMH1"        
[22] "RP5-850O15.4"  "RP5-866L20.3"  "NEXN"          "RP4-714D9.2"   "RP5-837M10.3"  "RP5-964H19.1"  "CSF1"         
[29] "RP11-498A13.1" "RP11-96K19.2" 
message("Genes that also apeared in the original score:")
Genes that also apeared in the original score:
base::intersect(top_myo,original_myo_genes) 
[1] "MYLK"  "TP63"  "ACTA2" "DKK3"  "TAGLN" "CDH3" 
myo_enrich_res = genes_vec_enrichment(genes = top_myo,background = rownames(acc1_cancer_cells),homer = T,title = "myo top enrichment",custom_pathways = luminal_gs)

myo_enrich_res

Lum Genes

lum_protein_markers = c("KIT")
top_lum  = top_correlated(dataset = acc1_cancer_cells, genes = lum_protein_markers,threshold = 0.2)
Warning in cor(expression %>% t(), markers_average) :
  the standard deviation is zero
print("Number of genes = " %>% paste(length(top_lum)))
[1] "Number of genes =  1535"
message("Names of genes:")
Names of genes:
top_lum %>% head(30)
 [1] "RP11-206L10.11" "HES4"           "RP11-54O7.18"   "CPTP"           "RP3-395M20.8"   "TNFRSF14"      
 [7] "RPL22"          "ICMT"           "ERRFI1"         "RERE"           "H6PD"           "TMEM201"       
[13] "NMNAT1"         "UBE4B"          "APITD1-CORT"    "CENPS"          "RP4-635E18.7"   "FBXO2"         
[19] "FBXO44"         "UQCRHL"         "MST1P2"         "SDHB"           "PADI2"          "ARHGEF10L"     
[25] "AKR7A2"         "TMCO4"          "EIF4G3"         "USP48"          "NIPAL3"         "PIGV"          
message("Genes that also apeared in the original score:")
Genes that also apeared in the original score:
base::intersect(top_lum,original_lum_genes) 
[1] "KIT"    "EHF"    "LGALS3"
lum_enrich_res = genes_vec_enrichment(genes = top_lum,background = rownames(acc1_cancer_cells),homer = T,title = "lum top enrichment",custom_pathways = luminal_gs)

lum_enrich_res
calculate_score = function(dataset,myo_genes,lum_genes,lum_threshold =1 , myo_threshold = -1) {
  myoscore=apply(dataset@assays[["RNA"]][myo_genes,],2,mean)
  lescore=apply(dataset@assays[["RNA"]][lum_genes,],2,mean)
  correlation = cor(lescore,myoscore) %>% round(digits = 2)
  message("correlation of lum score and myo score:" %>% paste(correlation))
  
  
  original_myo_genes = c("TP63", "TP73", "CAV1", "CDH3", "KRT5", "KRT14", "ACTA2", "TAGLN", "MYLK", "DKK3")
  original_lum_genes = c("KIT", "EHF", "ELF5", "KRT7", "CLDN3", "CLDN4", "CD24", "LGALS3", "LCN2", "SLPI")
  orig_myoscore=apply(dataset@assays[["RNA"]][original_myo_genes,],2,mean)
  orig_lescore=apply(dataset@assays[["RNA"]][original_lum_genes,],2,mean)
  correlation_to_original_lum = cor(orig_lescore,lescore) %>% round(digits = 2)
  correlation_to_original_myo = cor(orig_myoscore,myoscore) %>% round(digits = 2)

  message("correlation of lum score and original lum score:" %>% paste(correlation_to_original_lum))
  message("correlation of myo score and original myo score:" %>% paste(correlation_to_original_myo))

  dataset=AddMetaData(dataset,lescore-myoscore,"luminal_over_myo")
  print(
    FeaturePlot(object = dataset,features = "luminal_over_myo")
  )
  data = FetchData(object = dataset,vars = "luminal_over_myo")
  print(
    data %>% 
    ggplot(aes( x=luminal_over_myo)) + 
    geom_density() 
    )
  
lum_cells_num = subset(x = dataset,luminal_over_myo >(lum_threshold)) %>% ncol() /ncol(dataset)
myo_cells_num = subset(x = dataset,luminal_over_myo <(myo_threshold)) %>% ncol()/ncol(dataset)
df = data.frame(cell_type = c("myo_cells","lum_cells"),percentage = c(myo_cells_num,lum_cells_num))
ggplot(data=df, aes(x=cell_type, y=percentage)) +
  geom_bar(stat="identity") + ggtitle("ACC cell types")
}
myo_intersected = intersect(top_myo,original_myo_genes) 
lum_intersected = intersect(top_lum,original_lum_genes) 
message("genes in myo score:")
genes in myo score:
myo_intersected
[1] "MYLK"  "TP63"  "ACTA2" "DKK3"  "TAGLN" "CDH3" 
message("genes in lum score:")
genes in lum score:
lum_intersected
[1] "KIT"    "EHF"    "LGALS3"
calculate_score(dataset = acc1_cancer_cells,myo_genes = myo_intersected,lum_genes = lum_intersected,lum_threshold = 2,myo_threshold = 1)
correlation of lum score and myo score: 0.02
correlation of lum score and original lum score: 0.79
correlation of myo score and original myo score: 0.89

enriched genes

rownames(lum_enrich_res) = lum_enrich_res$pathway_name
lum_enriched_genes = lum_enrich_res[3,"geneID"] %>% strsplit(split = "/") %>% .[[1]] %>% c(.,lum_protein_markers) #add original markers
rownames(myo_enrich_res) = myo_enrich_res$pathway_name
myo_enriched_genes = myo_enrich_res[3,"geneID"] %>% strsplit(split = "/") %>% .[[1]] %>% c(.,myo_protein_markers) #add original markers
message("genes in myo score:")
genes in myo score:
myo_enriched_genes
 [1] "FBLIM1"  "NEXN"    "NMNAT2"  "MYLK"    "CCDC50"  "IGFBP7"  "RAI14"   "ARAP3"   "SPARC"   "CALD1"   "LOXL2"  
[12] "COL5A1"  "ACTA2"   "DKK3"    "MSRB3"   "COL4A1"  "ACTN1"   "TPM1"    "TGFB1I1" "ADORA2B" "MXRA7"   "CNN1"   
[23] "TP63"    "ACTA2"  
message("genes in lum score:")
genes in lum score:
lum_enriched_genes
 [1] "PADI2"      "PATJ"       "PEX11B"     "APH1A"      "C1orf43"    "EFNA4"      "NECTIN4"    "SCYL3"      "ELF3"      
[10] "SOX13"      "IRF6"       "MED28"      "EPB41L4A"   "RGL2"       "C6orf132"   "TPD52L1"    "ICA1"       "MACC1"     
[19] "TRPS1"      "FAM83H"     "RASEF"      "ARRDC1"     "COMMD3"     "ANK3"       "GSTO2"      "PDCD4"      "EHF"       
[28] "ALDH3B2"    "SHANK2"     "SORL1"      "FKBP4"      "PTPN6"      "DUSP16"     "RERG"       "ADCY6"      "ERBB3"     
[37] "ERP29"      "SUSD6"      "RPS6KA5"    "SPINT1"     "FEM1B"      "TLE3"       "SCAMP2"     "CLN3"       "ADGRG1"    
[46] "ATP2C2"     "GGT6"       "MYO1D"      "ST6GALNAC2" "CYB5A"      "BLVRB"      "VRK3"       "SYCP2"      "TMPRSS2"   
[55] "LIMK2"      "KIT"       
calculate_score(dataset = acc1_cancer_cells,myo_genes = myo_enriched_genes,lum_genes = lum_enriched_genes,lum_threshold = 0,myo_threshold = -1)
correlation of lum score and myo score: 0.1
correlation of lum score and original lum score: 0.71
correlation of myo score and original myo score: 0.76

enriched genes and in original score

myo_enriched_genes = myo_enriched_genes[myo_enriched_genes %in% original_myo_genes]
lum_enriched_genes = lum_enriched_genes[lum_enriched_genes %in% original_lum_genes]

message("genes in myo score:")
genes in myo score:
myo_enriched_genes
[1] "MYLK"  "ACTA2" "DKK3"  "TP63"  "ACTA2"
message("genes in lum score:")
genes in lum score:
lum_enriched_genes
[1] "EHF" "KIT"
calculate_score(dataset = acc1_cancer_cells,myo_genes = myo_enriched_genes,lum_genes = lum_enriched_genes,lum_threshold = 2,myo_threshold = 2)
correlation of lum score and myo score: -0.07
correlation of lum score and original lum score: 0.62
correlation of myo score and original myo score: 0.77

HPV

Only HMSC cancer cells:

HPV33_P3 = fread("./Data/HPV33_P3.txt",col.names = c("plate","reads")) %>% as.data.frame()
HPV33_P3.df = HPV33_P3 %>% mutate(
  plate = gsub(x =HPV33_P3$plate, replacement = "",pattern = "_.*$") 
  %>% gsub(pattern = "-P",replacement = ".P") 
  %>% gsub(pattern = "-",replacement = "_",)
  )
HPV33_P3.df = HPV33_P3.df %>% dplyr::filter(HPV33_P3.df$plate %in% colnames(acc1_cancer_cells))
rownames(HPV33_P3.df)  <- HPV33_P3.df$plate
HPV33_P3.df$plate = NULL


HPV33_P2 = fread("./Data/HPV33_P2.txt",col.names = c("plate","reads")) %>% as.data.frame()
HPV33_P2.df = HPV33_P2 %>% mutate(
  plate = gsub(x =HPV33_P2$plate, replacement = "",pattern = "_.*$") 
  %>% gsub(pattern = "plate2-",replacement = "plate2_",)
  %>% gsub(pattern = "-",replacement = "\\.",)
  )
HPV33_P2.df = HPV33_P2.df %>% dplyr::filter(HPV33_P2.df$plate %in% colnames(acc1_cancer_cells))
rownames(HPV33_P2.df)  <- HPV33_P2.df$plate
HPV33_P2.df$plate = NULL

HPV33 = rbind(HPV33_P3.df,HPV33_P2.df)
acc1_cancer_cells = AddMetaData(object = acc1_cancer_cells,metadata = HPV33,col.name = "HPV33.reads")
FeaturePlot(acc1_cancer_cells,features = "HPV33.reads",max.cutoff = 40)

data = FetchData(object = acc1_cancer_cells,vars = "HPV33.reads")
print(
  data %>% 
  ggplot(aes( x=HPV33.reads)) + 
  geom_density()
)
hpv33_positive = HPV33 %>% dplyr::mutate(hpv33_positive = if_else(reads > 20,true =  "positive",false =  "negative"))
hpv33_positive$reads = NULL
acc1_cancer_cells = AddMetaData(object = acc1_cancer_cells,metadata = hpv33_positive)

cNMF

library(reticulate)
#write expression
acc1_cancer_cells = FindVariableFeatures(object = acc1_cancer_cells,nfeatures = 2000)
vargenes = VariableFeatures(object = acc1_cancer_cells)
hmsc_expression = t(as.matrix(GetAssayData(acc1_cancer_cells,slot='data')))
hmsc_expression = 2**hmsc_expression #convert from log2(tpm+1) to tpm
hmsc_expression = hmsc_expression-1
# hmsc_expression = hmsc_expression[,!colSums(hmsc_expression==0, na.rm=TRUE)==nrow(hmsc_expression)] #delete rows that have all 0
hmsc_expression = hmsc_expression[,vargenes]
write.table(x = hmsc_expression ,file = './Data/cNMF/hmsc_expressionData_2Kvargenes.txt',sep = "\t")
from cnmf import cNMF
name = 'HMSC_cNMF_2Kvargenes'
outdir = './Data/cNMF'
K_range = np.arange(3,10)
cnmf_obj = cNMF(output_dir=outdir, name=name)
counts_fn='./Data/cNMF/hmsc_expressionData_2Kvargenes.txt'
tpm_fn = counts_fn ## This is a weird case where because this dataset is not 3' end umi sequencing, we opted to use the TPM matrix as the input matrix rather than the count matrix

cnmf_obj.prepare(counts_fn=counts_fn, components=K_range, seed=14,tpm_fn=tpm_fn)
cnmf_obj.factorize(worker_i=0, total_workers=1)
cnmf_obj.combine()
cnmf_obj.k_selection_plot()

Save object

import pickle
f = open('./Data/cNMF/HMSC_cNMF_2Kvargenes/cnmf_obj.pckl', 'wb')
pickle.dump(cnmf_obj, f)
f.close()

Load object

from cnmf import cNMF
import pickle
f = open('./Data/cNMF/HMSC_cNMF_2Kvargenes/cnmf_obj.pckl', 'rb')
cnmf_obj = pickle.load(f)
f.close()
selected_k = 4
density_threshold = 0.1
cnmf_obj.consensus(k=selected_k, density_threshold=density_threshold,show_clustering=True)
/sci/labs/yotamd/lab_share/avishai.wizel/python_envs/miniconda/envs/cnmf_env_6/lib/python3.7/site-packages/scanpy/preprocessing/_simple.py:843: UserWarning: Received a view of an AnnData. Making a copy.
  view_to_actual(adata)
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_k, density_threshold=density_threshold)
# calculate usage by expression*genes coefs:
gep_scores = py$gep_scores
all_metagenes= py$usage_norm

Enrichment analysis by top 200 genes of each program

plt_list = list()
for (i in 1:ncol(gep_scores)) {
  top_genes = gep_scores  %>%  arrange(desc(gep_scores[i])) #sort by score a
  top = head(rownames(top_genes),200) #take top top_genes_num
  res = genes_vec_enrichment(genes = top,background = rownames(gep_scores),homer = T,title = 
                    i,silent = T,return_all = T)
   
  plt_list[[i]] = res$plt
}
gridExtra::grid.arrange(grobs = plt_list)

# Make metagene names
for (i in 1:ncol(all_metagenes)) {
  colnames(all_metagenes)[i] = "metagene." %>% paste0(i)
}

#add each metagene to metadata
for (i in 1:ncol(all_metagenes)) {
  metage_metadata = all_metagenes %>% select(i)
  acc1_cancer_cells = AddMetaData(object = acc1_cancer_cells,metadata = metage_metadata)
}

FeaturePlot(object = acc1_cancer_cells,features = colnames(all_metagenes))

assignment UMAP

larger_by = 2
message(paste("larger_by = ", larger_by))
larger_by =  2
acc1_cancer_cells = program_assignment(dataset = acc1_cancer_cells,larger_by = larger_by,program_names = colnames(all_metagenes))
selected_k = py$selected_k
colors =  rainbow(selected_k)
colors = c(colors,"grey")
DimPlot(acc1_cancer_cells,group.by = "program.assignment",pt.size = 2,cols =colors)

Comparisions

cnv_vs_hpv = FetchData(object = acc1_cancer_cells,vars = c("cnv.cluster","hpv33_positive"))
test <- fisher.test(table(cnv_vs_hpv))
ggbarstats(
  cnv_vs_hpv, cnv.cluster, hpv33_positive,
  results.subtitle = FALSE,
  subtitle = paste0(
    "Fisher's exact test", ", p-value = ",
    ifelse(test$p.value < 0.001, "< 0.001", round(test$p.value, 3))
  )
)

cnv_vs_cnmf = FetchData(object = acc1_cancer_cells,vars = c("program.assignment","cnv.cluster"))
# cnv_vs_hpv =cnv_vs_hpv %>%  mutate(program.assignment = if_else(condition = program.assignment == "1",true = "1",false = "not 1"))
cnv_vs_cnmf = cnv_vs_cnmf %>% dplyr::filter(program.assignment == "1" |program.assignment == "2" )
test <- fisher.test(table(cnv_vs_cnmf))
ggbarstats(
  cnv_vs_cnmf, program.assignment, cnv.cluster,
  results.subtitle = FALSE,
  subtitle = paste0(
    "Fisher's exact test", ", p-value = ",
    ifelse(test$p.value < 0.001, "< 0.001", round(test$p.value, 3))
  )
)

cnmf_vs_hpv = FetchData(object = acc1_cancer_cells,vars = c("program.assignment","hpv33_positive"))
cnmf_vs_hpv = cnmf_vs_hpv %>% dplyr::filter(program.assignment == "1" |program.assignment == "2" )
test <- fisher.test(table(cnmf_vs_hpv))
ggbarstats(
  cnmf_vs_hpv, program.assignment, hpv33_positive,
  results.subtitle = FALSE,
  subtitle = paste0(
    "Fisher's exact test", ", p-value = ",
    ifelse(test$p.value < 0.001, "< 0.001", round(test$p.value, 3))
  )
)

myb_vs_cnv = FetchData(object = acc1_cancer_cells,vars = c("cnv.cluster","MYB"))
myb_vs_cnv $cnv.cluster = as.character(myb_vs_cnv $cnv.cluster )

ggboxplot(myb_vs_cnv, x = "cnv.cluster", y = "MYB",
          palette = "jco",
          add = "jitter")+ stat_compare_means(method = "wilcox.test",comparisons = list(c("1","2")))

myb_vs_hpv = FetchData(object = acc1_cancer_cells,vars = c("hpv33_positive","MYB"))
myb_vs_hpv $hpv33_positive = as.character(myb_vs_hpv $hpv33_positive )

ggboxplot(myb_vs_hpv, x = "hpv33_positive", y = "MYB",
          palette = "jco",
          add = "jitter")+ stat_compare_means(method = "wilcox.test",comparisons = list(c("positive","negative")))

hpvReads_vs_myb = FetchData(object = acc1_cancer_cells,vars = c("HPV33.reads","MYB"))
corr = cor(hpvReads_vs_myb$HPV33.reads,hpvReads_vs_myb$MYB)
print("correlation of MYB abd hpv33_reads:" %>% paste(corr %>% round(digits = 2)) )
[1] "correlation of MYB abd hpv33_reads: 0.21"
---
title: "Title"
author: "Avishai Wizel"
date: '`r Sys.time()`'
output: 
  html_notebook: 
    code_folding: hide
---
```{r, include=FALSE}
knitr::opts_chunk$set(
  include = T
)
```

## Data

```{r}
acc1_cancer_cells = readRDS("./Data/acc1_cancer_cells_15KnCount_V3.RDS")
all_acc_cancer_cells = readRDS("./Data/acc_cancer_cells_V3.RDS")
acc_all_cells = readRDS("./Data/acc_tpm_nCount_mito_no146_15k_with_ACC1_.RDS")
luminal_pathways = c("CHARAFE_BREAST_CANCER_LUMINAL_VS_BASAL_DN","CHARAFE_BREAST_CANCER_LUMINAL_VS_BASAL_UP","CHARAFE_BREAST_CANCER_LUMINAL_VS_MESENCHYMAL_DN","CHARAFE_BREAST_CANCER_LUMINAL_VS_MESENCHYMAL_UP","HUPER_BREAST_BASAL_VS_LUMINAL_DN","LIM_MAMMARY_LUMINAL_PROGENITOR_UP","SMID_BREAST_CANCER_LUMINAL_B_UP" )

# add luminal pathways
luminal_gs = msigdbr(species = "Homo sapiens") %>%as.data.frame() %>% dplyr::filter(gs_name %in% luminal_pathways)%>% dplyr::distinct(gs_name, gene_symbol) %>% as.data.frame()

```

## Parameters

```{r warning=FALSE}
suffix = "01_25"
data_to_read = ""
```


## functions

```{r warning=FALSE}
source_from_github(repositoy = "DEG_functions",version = "0.2.24")
source_from_github(repositoy = "HMSC_functions",version = "0.1.11",script_name = "functions.R")
source_from_github(repositoy = "cNMF_functions",version = "0.3.53",script_name = "cnmf_function_Harmony.R")
```


## Enrichment analysis HMSC vs ACC
```{r fig.width=8, echo=TRUE,results='hide',fig.keep='all'}
patient.ident = all_acc_cancer_cells$patient.ident %>% as.data.frame()
patient.ident[,1] = as.character(patient.ident[,1])
patient.ident[patient.ident[,1] == "ACC1",] = "HMSC"
patient.ident[,1] = as.factor(patient.ident[,1])
all_acc_cancer_cells = AddMetaData(object = all_acc_cancer_cells,metadata = patient.ident,col.name = "patient.ident")
all_acc_cancer_cells = SetIdent(all_acc_cancer_cells, value ="patient.ident")
acc_deg <- FindMarkers(all_acc_cancer_cells, ident.1 = "HMSC",logfc.threshold = 1.5,features = VariableFeatures(all_acc_cancer_cells))
enrichment_analysis(acc_deg,background = VariableFeatures(all_acc_cancer_cells),fdr_Cutoff = 0.01,ident.1 = "HMSC",ident.2 = "ACC",show_by = 1)
```

## cell cycle filtering {.tabset}


```{r warning=FALSE}
hallmark_name = "GO_MITOTIC_CELL_CYCLE"
genesets  =getGmt("./Data/h.all.v7.0.symbols.pluscc.gmt")
var_features=all_acc_cancer_cells@assays$RNA@var.features
geneIds= genesets[[hallmark_name]]@geneIds
score <- apply(all_acc_cancer_cells@assays$RNA@scale.data[intersect(geneIds,var_features),],2,mean)
all_acc_cancer_cells=AddMetaData(all_acc_cancer_cells,score,hallmark_name)

#filter:
all_acc_cancer_cells_ccFiltered=all_acc_cancer_cells[,all_acc_cancer_cells@meta.data[[hallmark_name]]< 0.3]


min_threshold = min(all_acc_cancer_cells$GO_MITOTIC_CELL_CYCLE)
max_threshold = max(all_acc_cancer_cells$GO_MITOTIC_CELL_CYCLE)
```


### Before cc filtering


```{r}
FeaturePlot(object = all_acc_cancer_cells,features = hallmark_name) + ggtitle("Before cc filtering")  & scale_color_gradientn(colours = plasma(n = 10, direction = -1), limits = c(min_threshold, max_threshold))
```

### After cc filtering

```{r}
FeaturePlot(object = all_acc_cancer_cells_ccFiltered,features = hallmark_name) + ggtitle("After cc filtering") & scale_color_gradientn(colours = plasma(n = 10, direction = -1), limits = c(min_threshold, max_threshold))
```
## {-}

## Enrichment analysis filtered HMSC vs ACC
```{r fig.width=8, echo=TRUE,results='hide',fig.keep='all'}
patient.ident = all_acc_cancer_cells_ccFiltered$patient.ident %>% as.data.frame()
patient.ident[,1] = as.character(patient.ident[,1])
patient.ident[patient.ident[,1] == "ACC1",] = "HMSC"
patient.ident[,1] = as.factor(patient.ident[,1])
all_acc_cancer_cells_ccFiltered = AddMetaData(object = all_acc_cancer_cells_ccFiltered,metadata = patient.ident,col.name = "patient.ident")
all_acc_cancer_cells_ccFiltered = SetIdent(all_acc_cancer_cells_ccFiltered, value ="patient.ident")
acc_deg <- FindMarkers(all_acc_cancer_cells_ccFiltered, ident.1 = "HMSC",logfc.threshold = 1.5,features = VariableFeatures(all_acc_cancer_cells_ccFiltered))
enrichment_analysis(acc_deg,background = VariableFeatures(all_acc_cancer_cells_ccFiltered),fdr_Cutoff = 0.01,ident.1 = "HMSC",ident.2 = "ACC",show_by = 1)
```

## MYB expression

```{r fig.width=10}
all_acc_cancer_cells = SetIdent(object = all_acc_cancer_cells,value = "patient.ident") #active snn graph
FeaturePlot(object = all_acc_cancer_cells,features = "MYB",label = T)
```

## CNV {.tabset}



```{r}
#set cell types
new.cluster.ids <- c("cancer", #0
                     "cancer", #1
                     "CAF", #2
                     "cancer", #3
                     "Endothelial", #4
                     "cancer", #5
                     "cancer", #6
                     "CAF", #7
                     "CAF", #8
                     "CAF", #9
                     "cancer", #10
                     "CAF", #11
                     "cancer", #12
                     "cancer", #13
                     "cancer", #14
                     "cancer", #15
                     "cancer", #16
                     "WBC", #17
                     "CAF" #18
                     )
```


```{r fig.show='hide'}
#rename idents:
acc_all_cells = SetIdent(object = acc_all_cells,value = "RNA_snn_res.1") #active snn graph
names(new.cluster.ids) <- levels(acc_all_cells) #add snn graph levels to new.cluster.ids
acc_all_cells@meta.data[["seurat_clusters"]] = acc_all_cells@meta.data[["RNA_snn_res.1"]]
acc_all_cells = SetIdent(object = acc_all_cells,value = "seurat_clusters")
acc_all_cells <- RenameIdents(acc_all_cells, new.cluster.ids) 

# divide "cancer" into patients:
cell_types = acc_all_cells@active.ident %>% as.data.frame()
cell_types[,1]<- as.character(cell_types[,1])
cell_types = cbind(cell_types,acc_all_cells$patient.ident) %>% setnames(old = names(.), 
         new = c('cell_type','patient'))
cell_types[cell_types$cell_type == "cancer",] = cell_types[cell_types$cell_type == "cancer",2]


# hmsc_rows = (startsWith(x = rownames(cell_types),prefix = "ACC.plate2") | startsWith(x = rownames(cell_types),prefix = "ACC1.")) & cell_types[,1] == "cancer" 
# acc_rows = !(startsWith(x = rownames(cell_types),prefix = "ACC.plate2") | startsWith(x = rownames(cell_types),prefix = "ACC1.")) & cell_types[,1] == "cancer" 
# cell_types[,1][hmsc_rows]  = "HMSC"
# cell_types[,1][acc_rows]  = "ACC"

#add to metadata:
cell_types[,2] = NULL 
cell_types[cell_types$cell_type == "ACC1",] = "HMSC"
acc_all_cells = AddMetaData(object =acc_all_cells ,metadata = cell_types,col.name = "cell.type")
```
### CNV UMAP 

```{r fig.width=10}
library(infercnv)
library(tidyverse)
acc_annotation  = as.data.frame(acc_all_cells@meta.data[,"cell.type",drop = F])
acc_annotation = acc_annotation %>% rownames_to_column("orig.ident") 
acc_annotation = acc_annotation %>% mutate(orig.ident = gsub(x = acc_annotation$orig.ident,pattern = "\\.", replacement = "-") %>% 
  gsub(pattern = "_", replacement = "-"))
  

write.table(acc_annotation, "./Data/inferCNV/acc_annotation.txt", append = FALSE, 
            sep = "\t", dec = ".",row.names = FALSE, col.names = F)

infercnv_obj = CreateInfercnvObject(raw_counts_matrix="./Data/inferCNV/all.4icnv.txt", 
                                    annotations_file="./Data/inferCNV/acc_annotation.txt",
                                    delim="\t",gene_order_file="./Data/inferCNV/gencode_v19_gene_pos.txt"
                                    ,ref_group_names=c("CAF", "Endothelial", "WBC")) #groups of normal cells

infercnv_obj_default = infercnv::run(infercnv_obj, cutoff=1, out_dir='./Data/inferCNV/infercnv_output',
                                     cluster_by_groups=T, plot_steps=FALSE,
                                     denoise=TRUE, HMM=FALSE, no_prelim_plot=TRUE,
                                     png_res=300)
plot_cnv(infercnv_obj_default, output_format = "png",  write_expr_matrix = FALSE,out_dir = "./Data/inferCNV/",png_res	=800,obs_title = "Malignant cells",ref_title = "Normal cells")


cluster.info=FetchData(acc_all_cells,c("ident","orig.ident","UMAP_1","UMAP_2","nCount_RNA","nFeature_RNA","percent.mt","patient.ident","seurat_clusters"))
cluster.info$cell=rownames(cluster.info)

library(limma)
smoothed=apply(infercnv_obj_default@expr.data,2,tricubeMovingAverage, span=0.01)
cnsig=sqrt(apply((smoothed-1)^2,2,mean))
umap=cluster.info
names(cnsig)=umap$cell

acc_all_cells <- AddMetaData(object = acc_all_cells, metadata = cnsig, col.name = "copynumber")
acc_all_cells = SetIdent(object = acc_all_cells,value = "cell.type")
FeaturePlot(acc_all_cells, "copynumber",pt.size = 1, cols = c("lightblue","orange","red","darkred"),label = T,repel = T)
```


### CNV plot 

![CNV plot](/sci/labs/yotamd/lab_share/avishai.wizel/R_projects/HMSC/Data/inferCNV/infercnv.png)
## {-}

## CNV subtypes

```{r}
cnv_subtypes = as.data.frame(cutree(infercnv_obj_default@tumor_subclusters[["hc"]][["HMSC"]], k = 2))
names(cnv_subtypes)[1] = "cnv.cluster"
rownames(cnv_subtypes) = rownames(cnv_subtypes) %>% gsub(pattern = "-",replacement = "\\.")
infercnv.observations = data.frame(fread(file = "./Data/inferCNV/infercnv.observations.txt"), row.names=1)
names_to_keep = colnames(infercnv.observations) %in% (colnames(acc1_cancer_cells) %>% gsub(pattern = "_",replacement = "\\."))
infercnv.observations = infercnv.observations[,names_to_keep]
```

```{r}
rotate <- function(x) t(apply(x, 2, rev))
infercnv.observations2 = infercnv.observations %>% rotate() %>%  rotate() %>% rotate()%>% as.data.frame() 
breaks = c(0.700891861704857,
0.742366945528369,
0.783842029351881,
0.825317113175393,
0.866792196998905,
0.908267280822417,
0.949742364645928,
0.99121744846944,
1.03269253229295,
1.07416761611646,
1.11564269993998,
1.15711778376349,
1.198592867587,
1.24006795141051,
1.28154303523402,
1.32301811905753)
pheatmap(infercnv.observations2,cluster_cols = F,cluster_rows = F, show_rownames = F,show_colnames = F, breaks = breaks,color = colorRampPalette(rev(c("darkred", "white", "darkblue")))(15),annotation_row = cnv_subtypes)

```

```{r}
rownames(cnv_subtypes) = rownames(cnv_subtypes) %>% gsub(pattern = "2\\.",replacement = "2_")
rownames(cnv_subtypes) = rownames(cnv_subtypes) %>% gsub(pattern = "3\\.",replacement = "3_")
acc1_cancer_cells = AddMetaData(object = acc1_cancer_cells,metadata = cnv_subtypes)
```

```{r}
DimPlot(acc1_cancer_cells,group.by = "cnv.cluster",pt.size = 2,cols =colors)
```


## Original score
```{r}
original_myo_genes = c( "TP63", "TP73", "CAV1", "CDH3", "KRT5", "KRT14", "ACTA2", "TAGLN", "MYLK", "DKK3")
original_lum_genes = c("KIT", "EHF", "ELF5", "KRT7", "CLDN3", "CLDN4", "CD24", "LGALS3", "LCN2", "SLPI" )
```



```{r}
calculate_score(dataset = all_acc_cancer_cells,myo_genes = original_myo_genes,lum_genes = original_lum_genes)
```
## Original score of ACC1
```{r}
calculate_score(dataset = acc1_cancer_cells,myo_genes = original_myo_genes,lum_genes = original_lum_genes,lum_threshold = 0,myo_threshold = 0)
```


## 0.35 Most correlated score {.tabset}

### Myo genes


```{r warning=FALSE, collapse=T}
myo_protein_markers = c("CNN1", "TP63","ACTA2")
top_myo  = top_correlated(dataset = acc1_cancer_cells, genes = myo_protein_markers,threshold = 0.35)
print("Number of genes = " %>% paste(length(top_myo)))
message("Names of genes:")
top_myo %>% head(30)
message("Genes that also apeared in the original score:")
base::intersect(top_myo,original_myo_genes) 
```
```{r}
myo_enrich_res = genes_vec_enrichment(genes = top_myo,background = rownames(acc1_cancer_cells),homer = T,title = "myo top enrichment",custom_pathways = luminal_gs)
myo_enrich_res
```
### Lum genes
```{r}
lum_protein_markers = c("KIT")
top_lum  = top_correlated(dataset = acc1_cancer_cells, genes = lum_protein_markers,threshold = 0.35)
print("Number of genes = " %>% paste(length(top_lum)))
message("Names of genes:")
top_lum %>% head(30)
message("Genes that also apeared in the original score:")
base::intersect(top_lum,original_lum_genes) 
```

```{r}
lum_enrich_res = genes_vec_enrichment(genes = top_lum,background = rownames(acc1_cancer_cells),homer = T,title = "lum top enrichment",custom_pathways = luminal_gs)
lum_enrich_res
```
### top correlated score
```{r}
calculate_score(dataset = acc1_cancer_cells,myo_genes = top_myo,lum_genes = top_lum,lum_threshold = 0,myo_threshold = 0)
```


###  enriched genes score
```{r}
rownames(lum_enrich_res) = lum_enrich_res$pathway_name
lum_enriched_genes = lum_enrich_res[1,"geneID"] %>% strsplit(split = "/") %>% .[[1]] %>% c(.,lum_protein_markers) #add original markers
```

```{r}
rownames(myo_enrich_res) = myo_enrich_res$pathway_name
myo_enriched_genes = myo_enrich_res[1,"geneID"] %>% strsplit(split = "/") %>% .[[1]] %>% c(.,myo_protein_markers) #add original markers
```

```{r}
calculate_score(dataset = acc1_cancer_cells,myo_genes = myo_enriched_genes,lum_genes = lum_enriched_genes,lum_threshold = -2.5,myo_threshold = -2.5)
```


## {-}


##  0.2 Most correlated score {.tabset}

###  myo Genes


```{r warning=FALSE}
myo_protein_markers = c("CNN1", "TP63","ACTA2")
top_myo  = top_correlated(dataset = acc1_cancer_cells, genes = myo_protein_markers,threshold = 0.2)
print("Number of genes = " %>% paste(length(top_myo)))
message("Names of genes:")
top_myo %>% head(30)
message("Genes that also apeared in the original score:")
base::intersect(top_myo,original_myo_genes) 
```

```{r}
myo_enrich_res = genes_vec_enrichment(genes = top_myo,background = rownames(acc1_cancer_cells),homer = T,title = "myo top enrichment",custom_pathways = luminal_gs)
myo_enrich_res
```
###  Lum Genes

```{r}
lum_protein_markers = c("KIT")
top_lum  = top_correlated(dataset = acc1_cancer_cells, genes = lum_protein_markers,threshold = 0.2)
print("Number of genes = " %>% paste(length(top_lum)))
message("Names of genes:")
top_lum %>% head(30)
message("Genes that also apeared in the original score:")
base::intersect(top_lum,original_lum_genes) 
```

```{r}
lum_enrich_res = genes_vec_enrichment(genes = top_lum,background = rownames(acc1_cancer_cells),homer = T,title = "lum top enrichment",custom_pathways = luminal_gs)
lum_enrich_res
```
```{r}
calculate_score = function(dataset,myo_genes,lum_genes,lum_threshold =1 , myo_threshold = -1) {
  myoscore=apply(dataset@assays[["RNA"]][myo_genes,],2,mean)
  lescore=apply(dataset@assays[["RNA"]][lum_genes,],2,mean)
  correlation = cor(lescore,myoscore) %>% round(digits = 2)
  message("correlation of lum score and myo score:" %>% paste(correlation))
  
  
  original_myo_genes = c("TP63", "TP73", "CAV1", "CDH3", "KRT5", "KRT14", "ACTA2", "TAGLN", "MYLK", "DKK3")
  original_lum_genes = c("KIT", "EHF", "ELF5", "KRT7", "CLDN3", "CLDN4", "CD24", "LGALS3", "LCN2", "SLPI")
  orig_myoscore=apply(dataset@assays[["RNA"]][original_myo_genes,],2,mean)
  orig_lescore=apply(dataset@assays[["RNA"]][original_lum_genes,],2,mean)
  correlation_to_original_lum = cor(orig_lescore,lescore) %>% round(digits = 2)
  correlation_to_original_myo = cor(orig_myoscore,myoscore) %>% round(digits = 2)

  message("correlation of lum score and original lum score:" %>% paste(correlation_to_original_lum))
  message("correlation of myo score and original myo score:" %>% paste(correlation_to_original_myo))

  dataset=AddMetaData(dataset,lescore-myoscore,"luminal_over_myo")
  print(
    FeaturePlot(object = dataset,features = "luminal_over_myo")
  )
  data = FetchData(object = dataset,vars = "luminal_over_myo")
  print(
    data %>% 
    ggplot(aes( x=luminal_over_myo)) + 
    geom_density() 
    )
  
lum_cells_num = subset(x = dataset,luminal_over_myo >(lum_threshold)) %>% ncol() /ncol(dataset)
myo_cells_num = subset(x = dataset,luminal_over_myo <(myo_threshold)) %>% ncol()/ncol(dataset)
df = data.frame(cell_type = c("myo_cells","lum_cells"),percentage = c(myo_cells_num,lum_cells_num))
ggplot(data=df, aes(x=cell_type, y=percentage)) +
  geom_bar(stat="identity") + ggtitle("ACC cell types")
}

```

```{r}
myo_intersected = intersect(top_myo,original_myo_genes) 
lum_intersected = intersect(top_lum,original_lum_genes) 
message("genes in myo score:")
myo_intersected

message("genes in lum score:")
lum_intersected
calculate_score(dataset = acc1_cancer_cells,myo_genes = myo_intersected,lum_genes = lum_intersected,lum_threshold = 2,myo_threshold = 1)
```



### enriched genes
```{r}
rownames(lum_enrich_res) = lum_enrich_res$pathway_name
lum_enriched_genes = lum_enrich_res[3,"geneID"] %>% strsplit(split = "/") %>% .[[1]] %>% c(.,lum_protein_markers) #add original markers
```

```{r}
rownames(myo_enrich_res) = myo_enrich_res$pathway_name
myo_enriched_genes = myo_enrich_res[3,"geneID"] %>% strsplit(split = "/") %>% .[[1]] %>% c(.,myo_protein_markers) #add original markers
```

```{r}
message("genes in myo score:")
myo_enriched_genes

message("genes in lum score:")
lum_enriched_genes

calculate_score(dataset = acc1_cancer_cells,myo_genes = myo_enriched_genes,lum_genes = lum_enriched_genes,lum_threshold = 0,myo_threshold = -1)
```


### enriched genes and in original score
```{r}
myo_enriched_genes = myo_enriched_genes[myo_enriched_genes %in% original_myo_genes]
lum_enriched_genes = lum_enriched_genes[lum_enriched_genes %in% original_lum_genes]

message("genes in myo score:")
myo_enriched_genes

message("genes in lum score:")
lum_enriched_genes

calculate_score(dataset = acc1_cancer_cells,myo_genes = myo_enriched_genes,lum_genes = lum_enriched_genes,lum_threshold = 2,myo_threshold = 2)
```

## {-}


# HPV

Only HMSC cancer cells:

```{r}
HPV33_P3 = fread("./Data/HPV33_P3.txt",col.names = c("plate","reads")) %>% as.data.frame()
HPV33_P3.df = HPV33_P3 %>% mutate(
  plate = gsub(x =HPV33_P3$plate, replacement = "",pattern = "_.*$") 
  %>% gsub(pattern = "-P",replacement = ".P") 
  %>% gsub(pattern = "-",replacement = "_",)
  )
HPV33_P3.df = HPV33_P3.df %>% dplyr::filter(HPV33_P3.df$plate %in% colnames(acc1_cancer_cells))
rownames(HPV33_P3.df)  <- HPV33_P3.df$plate
HPV33_P3.df$plate = NULL


HPV33_P2 = fread("./Data/HPV33_P2.txt",col.names = c("plate","reads")) %>% as.data.frame()
HPV33_P2.df = HPV33_P2 %>% mutate(
  plate = gsub(x =HPV33_P2$plate, replacement = "",pattern = "_.*$") 
  %>% gsub(pattern = "plate2-",replacement = "plate2_",)
  %>% gsub(pattern = "-",replacement = "\\.",)
  )
HPV33_P2.df = HPV33_P2.df %>% dplyr::filter(HPV33_P2.df$plate %in% colnames(acc1_cancer_cells))
rownames(HPV33_P2.df)  <- HPV33_P2.df$plate
HPV33_P2.df$plate = NULL

HPV33 = rbind(HPV33_P3.df,HPV33_P2.df)
acc1_cancer_cells = AddMetaData(object = acc1_cancer_cells,metadata = HPV33,col.name = "HPV33.reads")
FeaturePlot(acc1_cancer_cells,features = "HPV33.reads",max.cutoff = 40)
```
```{r}
data = FetchData(object = acc1_cancer_cells,vars = "HPV33.reads")
print(
  data %>% 
  ggplot(aes( x=HPV33.reads)) + 
  geom_density()
)
```
```{r}
hpv33_positive = HPV33 %>% dplyr::mutate(hpv33_positive = if_else(reads > 20,true =  "positive",false =  "negative"))
hpv33_positive$reads = NULL
acc1_cancer_cells = AddMetaData(object = acc1_cancer_cells,metadata = hpv33_positive)
```

# cNMF
```{r}
library(reticulate)
```

```{r}
#write expression
acc1_cancer_cells = FindVariableFeatures(object = acc1_cancer_cells,nfeatures = 2000)
vargenes = VariableFeatures(object = acc1_cancer_cells)
hmsc_expression = t(as.matrix(GetAssayData(acc1_cancer_cells,slot='data')))
hmsc_expression = 2**hmsc_expression #convert from log2(tpm+1) to tpm
hmsc_expression = hmsc_expression-1
# hmsc_expression = hmsc_expression[,!colSums(hmsc_expression==0, na.rm=TRUE)==nrow(hmsc_expression)] #delete rows that have all 0
hmsc_expression = hmsc_expression[,vargenes]
write.table(x = hmsc_expression ,file = './Data/cNMF/hmsc_expressionData_2Kvargenes.txt',sep = "\t")
```



```{python eval=F}
from cnmf import cNMF
name = 'HMSC_cNMF_2Kvargenes'
outdir = './Data/cNMF'
K_range = np.arange(3,10)
cnmf_obj = cNMF(output_dir=outdir, name=name)
counts_fn='./Data/cNMF/hmsc_expressionData_2Kvargenes.txt'
tpm_fn = counts_fn ## This is a weird case where because this dataset is not 3' end umi sequencing, we opted to use the TPM matrix as the input matrix rather than the count matrix

cnmf_obj.prepare(counts_fn=counts_fn, components=K_range, seed=14,tpm_fn=tpm_fn)
```

```{python eval=F}
cnmf_obj.factorize(worker_i=0, total_workers=1)
```

```{python eval=F}
cnmf_obj.combine()
cnmf_obj.k_selection_plot()
```
## Save object
```{python eval=F}
import pickle
f = open('./Data/cNMF/HMSC_cNMF_2Kvargenes/cnmf_obj.pckl', 'wb')
pickle.dump(cnmf_obj, f)
f.close()
```


## Load object
```{python}
from cnmf import cNMF
import pickle
f = open('./Data/cNMF/HMSC_cNMF_2Kvargenes/cnmf_obj.pckl', 'rb')
cnmf_obj = pickle.load(f)
f.close()
```


```{python}
selected_k = 4
density_threshold = 0.1
cnmf_obj.consensus(k=selected_k, density_threshold=density_threshold,show_clustering=True)
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_k, density_threshold=density_threshold)
```

```{r}
# calculate usage by expression*genes coefs:
gep_scores = py$gep_scores
all_metagenes= py$usage_norm
```

## Enrichment analysis by top 200 genes of each program
```{r fig.height=8, fig.width=8, results='hide'}
plt_list = list()
for (i in 1:ncol(gep_scores)) {
  top_genes = gep_scores  %>%  arrange(desc(gep_scores[i])) #sort by score a
  top = head(rownames(top_genes),200) #take top top_genes_num
  res = genes_vec_enrichment(genes = top,background = rownames(gep_scores),homer = T,title = 
                    i,silent = T,return_all = T)
   
  plt_list[[i]] = res$plt
}
gridExtra::grid.arrange(grobs = plt_list)
```

```{r fig.height=10, fig.width=10}
# Make metagene names
for (i in 1:ncol(all_metagenes)) {
  colnames(all_metagenes)[i] = "metagene." %>% paste0(i)
}

#add each metagene to metadata
for (i in 1:ncol(all_metagenes)) {
  metage_metadata = all_metagenes %>% select(i)
  acc1_cancer_cells = AddMetaData(object = acc1_cancer_cells,metadata = metage_metadata)
}

FeaturePlot(object = acc1_cancer_cells,features = colnames(all_metagenes))

```
## assignment UMAP
```{r}
larger_by = 2
message(paste("larger_by = ", larger_by))
acc1_cancer_cells = program_assignment(dataset = acc1_cancer_cells,larger_by = larger_by,program_names = colnames(all_metagenes))
selected_k = py$selected_k
colors =  rainbow(selected_k)
colors = c(colors,"grey")
DimPlot(acc1_cancer_cells,group.by = "program.assignment",pt.size = 2,cols =colors)
``` 

## Comparisions
```{r}
cnv_vs_hpv = FetchData(object = acc1_cancer_cells,vars = c("cnv.cluster","hpv33_positive"))
test <- fisher.test(table(cnv_vs_hpv))
ggbarstats(
  cnv_vs_hpv, cnv.cluster, hpv33_positive,
  results.subtitle = FALSE,
  subtitle = paste0(
    "Fisher's exact test", ", p-value = ",
    ifelse(test$p.value < 0.001, "< 0.001", round(test$p.value, 3))
  )
)

```

```{r}
cnv_vs_cnmf = FetchData(object = acc1_cancer_cells,vars = c("program.assignment","cnv.cluster"))
cnv_vs_cnmf = cnv_vs_cnmf %>% dplyr::filter(program.assignment == "1" |program.assignment == "2" )
test <- fisher.test(table(cnv_vs_cnmf))
ggbarstats(
  cnv_vs_cnmf, program.assignment, cnv.cluster,
  results.subtitle = FALSE,
  subtitle = paste0(
    "Fisher's exact test", ", p-value = ",
    ifelse(test$p.value < 0.001, "< 0.001", round(test$p.value, 3))
  )
)

```

```{r}
cnmf_vs_hpv = FetchData(object = acc1_cancer_cells,vars = c("program.assignment","hpv33_positive"))
cnmf_vs_hpv = cnmf_vs_hpv %>% dplyr::filter(program.assignment == "1" |program.assignment == "2" )
test <- fisher.test(table(cnmf_vs_hpv))
ggbarstats(
  cnmf_vs_hpv, program.assignment, hpv33_positive,
  results.subtitle = FALSE,
  subtitle = paste0(
    "Fisher's exact test", ", p-value = ",
    ifelse(test$p.value < 0.001, "< 0.001", round(test$p.value, 3))
  )
)

```

```{r}
myb_vs_cnv = FetchData(object = acc1_cancer_cells,vars = c("cnv.cluster","MYB"))
myb_vs_cnv $cnv.cluster = as.character(myb_vs_cnv $cnv.cluster )

ggboxplot(myb_vs_cnv, x = "cnv.cluster", y = "MYB",
          palette = "jco",
          add = "jitter")+ stat_compare_means(method = "wilcox.test",comparisons = list(c("1","2")))
```

```{r}
myb_vs_hpv = FetchData(object = acc1_cancer_cells,vars = c("hpv33_positive","MYB"))
myb_vs_hpv $hpv33_positive = as.character(myb_vs_hpv $hpv33_positive )

ggboxplot(myb_vs_hpv, x = "hpv33_positive", y = "MYB",
          palette = "jco",
          add = "jitter")+ stat_compare_means(method = "wilcox.test",comparisons = list(c("positive","negative")))
```



```{r}
hpvReads_vs_myb = FetchData(object = acc1_cancer_cells,vars = c("HPV33.reads","MYB"))
corr = cor(hpvReads_vs_myb$HPV33.reads,hpvReads_vs_myb$MYB)
print("correlation of MYB abd hpv33_reads:" %>% paste(corr %>% round(digits = 2)) )
```


